knitr::opts_knit$set(root.dir = "/home/rstudio/NEOPLASY_PRIMATES")
knitr::opts_chunk$set(engine.path = list(
python = '/home/rstudio/NEOPLASY_PRIMATES/TreeCluster/venv/bin/python'))
getwd()
## [1] "/home/rstudio/NEOPLASY_PRIMATES/Scripts"
Directory definition
workingDir <- getwd()
dataDir <- file.path(workingDir, "Data")
resultsDir <- file.path(workingDir,"Out")
getwd()
## [1] "/home/rstudio/NEOPLASY_PRIMATES"
Seed and library loading
set.seed(1998)
# General use libraries
library(ape)
library(RRphylo)
library(phytools)
library(reticulate)
library(dplyr)
# Non-concordant species retrieval
library(stringdist)
library(taxize)
library(R.utils)
# Sort distances chunk
library(adephylo)
library(castor)
# Cluster merging
library(dplyr)
# Plotting
library(ggplot2)
library(tidyr)
library(dplyr)
library(viridis)
library(ggtree)
library(RColorBrewer)
library(svglite)
# From Alejandro
library(e1071)
# From Noelia
library(tidyverse)
library(ggpubr)
library(FactoMineR)
library(factoextra)
library(corrplot)
# library(GGally)
library(cowplot)
library(ape)
library(nlme)
# library(gt)
library(ggtree)
library(phytools)
library(RRphylo)
library(picante)
library(ggnewscale)
library(reshape)
library(palettetown)
library(compositions)
library(gt)
Tree import and prunning
primates_tree <- read.newick(file.path(dataDir,"233-GENOMES/science.abn7829_data_s4.nex.tree"))
tree_species <- primates_tree$tip.label
cancer_traits <- read.csv(file.path(dataDir, "Neoplasia/species360_primates_neoplasia_20230519.csv"), sep = ",", header = TRUE)
# Remove whitespaces
cancer_traits[] <- lapply(cancer_traits, function(col) trimws(col))
# binarize
cancer_traits$label <- gsub(" ", "_", cancer_traits$species)
# Reorder and filter species
cancer_traits <- cancer_traits %>%
select(family,species,label,common_name,necropsy_count,neoplasia_necropsy,neoplasia_prevalence, everything()) %>%
filter(!is.na(neoplasia_prevalence))
# Prune the tree as Noelia does
common_names <- intersect(primates_tree$tip.label, cancer_traits$label)
pruned_primates.tree <- drop.tip(primates_tree, setdiff(primates_tree$tip.label, common_names))
cancer_traits <- cancer_traits %>%
mutate(ordering = match(label, pruned_primates.tree$tip.label)) %>%
arrange(ordering) %>%
filter(!is.na(ordering))
# Outlier
cancer_traits_filt <- cancer_traits %>%
select(-common_name, -ordering, -species)
cancer_traits_filt <- cancer_traits_filt %>%
mutate(across(c(necropsy_count, neoplasia_necropsy, neoplasia_prevalence), as.double))
means <- t(cancer_traits_filt[,-which(colnames(cancer_traits_filt) %in% c("family", "label"))] %>%
summarize_all(., mean))
medians <- t(cancer_traits_filt[,-which(colnames(cancer_traits_filt) %in% c("family", "label"))] %>%
summarize_all(., median))
sds <- t(cancer_traits_filt[,-which(colnames(cancer_traits_filt) %in% c("family", "label"))] %>%
summarize_all(., list(sd)))
# MELT the df (longize it)
melted_dataframe_traits <- melt(cancer_traits_filt, id.vars = c("family", "label"), measure.vars = c("necropsy_count", "neoplasia_necropsy", "neoplasia_prevalence"))
colnames(melted_dataframe_traits) <- c("Family", "Species", "Trait", "value")
# Add stats to the melted df
for (i in 1:length(melted_dataframe_traits$Species)){
if(as.character(melted_dataframe_traits$Trait[i]) %in% rownames(medians)){
melted_dataframe_traits$median[i] <- as.numeric(as.character(medians[rownames(medians) == melted_dataframe_traits$Trait[i]]))
melted_dataframe_traits$mean[i] <- as.numeric(as.character(means[rownames(means) == melted_dataframe_traits$Trait[i]]))
melted_dataframe_traits$sd[i] <- as.numeric(as.character(sds[rownames(sds) == melted_dataframe_traits$Trait[i]]))
}}
calculate_stat <- function(stat_fn) {
lapply(unique(cancer_traits_filt$family), function(fam) {
subset_df <- cancer_traits_filt[cancer_traits_filt$family == fam, ]
stats <- subset_df[,-which(colnames(subset_df) %in% c("label", "family"))] %>%
summarize_all(stat_fn)
data.frame(family = fam, stats)
})
}
fam_means_list <- calculate_stat(mean)
fam_medians_list <- calculate_stat(median)
fam_sds_list <- calculate_stat(sd)
bind_rows_df <- function(list_data) {
do.call(rbind, list_data)
}
fam_means <- bind_rows_df(fam_means_list)
fam_medians <- bind_rows_df(fam_medians_list)
fam_sds <- bind_rows_df(fam_sds_list)
for (i in 1:length(melted_dataframe_traits$Species)){
current_family <- melted_dataframe_traits$Family[i]
current_trait <- as.character(melted_dataframe_traits$Trait[i])
if(current_trait %in% colnames(fam_means)){
melted_dataframe_traits$mean_fam[i] <- fam_means[fam_means$family == current_family, current_trait]
melted_dataframe_traits$median_fam[i] <- fam_medians[fam_medians$family == current_family, current_trait]
melted_dataframe_traits$sd_fam[i] <- fam_sds[fam_sds$family == current_family, current_trait]
}
}
# Compute top bottom definitions
# global
melted_dataframe_traits <- melted_dataframe_traits %>%
dplyr::group_by(Trait) %>%
mutate(label = case_when(
value < median - sd ~ "low_extreme",
value > median + sd ~ "high_extreme",
TRUE ~ "normal")) %>%
mutate(outlier = case_when(
value < quantile(value, 0.25,na.rm=TRUE) - 1.5 * IQR(value, na.rm=TRUE) ~ paste0("low_outlier_",Trait),
value > quantile(value, 0.75,na.rm=TRUE) + 1.5 * IQR(value, na.rm=TRUE) ~ paste0("high_outlier_",Trait),
TRUE ~ "no"))
# global 3sd calculation
melted_dataframe_traits <- melted_dataframe_traits %>%
dplyr::group_by(Trait) %>%
mutate(label_2sd = case_when(
value < median - 2*sd ~ "very_low_extreme",
value > median + 2*sd ~ "very_high_extreme",
TRUE ~ "normal"))
# by family
melted_dataframe_traits <- melted_dataframe_traits %>%
dplyr::group_by(Family, Trait) %>%
mutate(z_score_fam = (value-mean)/sd) %>%
mutate(label_family = case_when(
value < median_fam - sd_fam ~ "low_extreme",
value > median_fam + sd_fam ~ "high_extreme",
TRUE ~ "normal"))
# family 2sd calculation
melted_dataframe_traits <- melted_dataframe_traits %>%
dplyr::group_by(Family, Trait) %>%
mutate(z_score_fam_2sd = (value-mean)/(2*sd)) %>%
mutate(label_family_2sd = case_when(
value < median_fam - 2*sd_fam ~ "very_low_extreme",
value > median_fam + 2*sd_fam ~ "very_high_extreme",
TRUE ~ "normal"))
spread_df <- melted_dataframe_traits %>%
spread(Trait, value)
# Filter dataframe for x and y values
df_x <- melted_dataframe_traits[melted_dataframe_traits$Trait == "necropsy_count", ]
df_y <- melted_dataframe_traits[melted_dataframe_traits$Trait == "neoplasia_necropsy", ]
# Merge them based on a common identifier (assuming you have an 'ID' column)
merged_df <- merge(df_x, df_y, by=c("Species","Family"))
# Map_plot
# Create a scatterplot
scatter_plot <- ggplot(data = merged_df,
aes(x = value.x, y = value.y, color = Family)) +
geom_point(data = subset(merged_df,
!(outlier.y %in% c("high_outlier_neoplasia_necropsy", "low_outlier_neoplasia_necropsy"))),
size = 4) +
# Plot outliers for y variable with a different shape
geom_point(data = subset(merged_df, outlier.y %in% c("high_outlier_neoplasia_necropsy", "low_outlier_neoplasia_necropsy")),
aes(shape = outlier.y), size = 4) +
scale_shape_manual(values = c("high_outlier_neoplasia_necropsy" = 17, "low_outlier_neoplasia_necropsy" = 17)) +
# Label species that are outliers for y variable
geom_text(data = subset(merged_df, outlier.y %in% c("high_outlier_neoplasia_necropsy", "low_outlier_neoplasia_necropsy")),
aes(label = Species), nudge_y = 1.5, size = 4, vjust = "right", hjust = "top") +
labs(x = "Necropsy Count", y = "Neoplasia/Necropsy") +
theme_minimal() +
geom_smooth(method = "lm", se = TRUE, color="black") +
theme(
axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 10)),
legend.position = "none")
print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'

model <- lm(value.y ~ value.x, data = merged_df)
summary(model)
##
## Call:
## lm(formula = value.y ~ value.x, data = merged_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.224 -3.196 -0.775 2.660 10.929
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.197603 0.997605 -0.198 0.844
## value.x 0.078227 0.009278 8.432 3.09e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.746 on 51 degrees of freedom
## Multiple R-squared: 0.5823, Adjusted R-squared: 0.5741
## F-statistic: 71.09 on 1 and 51 DF, p-value: 3.093e-11
Top-to-bottom plots
Median - 1SD
df_z <- melted_dataframe_traits[melted_dataframe_traits$Trait == "neoplasia_prevalence", ]
df_z$median_fam <- factor(df_z$median_fam, levels = unique(df_z$median_fam))
# With outliers
ggplot(data = df_z,
aes(x = value,
y = Family)) +
geom_point(data = subset(df_z,
!(label_family %in% c("low_extreme", "normal", "high_extreme"))),
size = 4) +
geom_point(data = subset(df_z, label_family %in% c("low_extreme", "normal", "high_extreme")),
aes(shape = label_family, color = Family), size = 4) +
scale_shape_manual(values = c("low_extreme" = 22, "normal" = 21, "high_extreme" = 24)) +
stat_summary(fun.x = median, geom = "errorbar",
aes(xmax = ..x.., xmin = ..x..), linewidth = 1, color = pokepal("rayquaza", spread = 1)) +
geom_vline(aes(xintercept = median), linetype = "dotted", color = pokepal("rayquaza", spread = 1)) +
labs(x = "Neoplasia Prevalence", y = "Families") +
theme(
axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 20)),
legend.position = "none") +
xlim(0,0.5)
## Warning in stat_summary(fun.x = median, geom = "errorbar", aes(xmax = ..x.., :
## Ignoring unknown parameters: `fun.x`
## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## No summary function supplied, defaulting to `mean_se()`

Median - 1SD without outliers
# Without outliers
df_z_nooutlier <- df_z[df_z$outlier == "no", ]
ggplot(data = df_z_nooutlier,
aes(x = value,
y = Family)) +
geom_point(data = subset(df_z_nooutlier,
!(label_family %in% c("low_extreme", "normal", "high_extreme"))),
size = 4) +
geom_point(data = subset(df_z_nooutlier, label_family %in% c("low_extreme", "normal", "high_extreme")),
aes(shape = label_family, color = Family), size = 4) +
scale_shape_manual(values = c("low_extreme" = 22, "normal" = 21, "high_extreme" = 24)) +
stat_summary(fun.x = median, geom = "errorbar",
aes(xmax = ..x.., xmin = ..x..), linewidth = 1, color = pokepal("rayquaza", spread = 1)) +
geom_vline(aes(xintercept = median), linetype = "dotted", color = pokepal("rayquaza", spread = 1)) +
labs(x = "Neoplasia Prevalence", y = "Families") +
theme(
axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 20)),
legend.position = "none") +
xlim(0,0.25)
## Warning in stat_summary(fun.x = median, geom = "errorbar", aes(xmax = ..x.., :
## Ignoring unknown parameters: `fun.x`
## No summary function supplied, defaulting to `mean_se()`

Median - 2SD
ggplot(data = df_z,
aes(x = value,
y = Family)) +
geom_point(data = subset(df_z,
!(label_family_2sd %in% c("very_low_extreme", "normal", "very_high_extreme"))),
size = 4) +
geom_point(data = subset(df_z, label_family_2sd %in% c("very_low_extreme", "normal", "very_high_extreme")),
aes(shape = label_family_2sd, color = Family), size = 4) +
scale_shape_manual(values = c("very_low_extreme" = 22, "normal" = 21, "very_high_extreme" = 24)) +
stat_summary(fun.x = median, geom = "errorbar",
aes(xmax = ..x.., xmin = ..x..), linewidth = 1, color = pokepal("rayquaza", spread = 1)) +
geom_vline(aes(xintercept = median), linetype = "dotted", color = pokepal("rayquaza", spread = 1)) +
labs(x = "Neoplasia Prevalence", y = "Families") +
theme(
axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 20)),
legend.position = "none") +
xlim(0,0.5)
## Warning in stat_summary(fun.x = median, geom = "errorbar", aes(xmax = ..x.., :
## Ignoring unknown parameters: `fun.x`
## No summary function supplied, defaulting to `mean_se()`

Median - 2SD without outliers
ggplot(data = df_z_nooutlier,
aes(x = value,
y = Family)) +
geom_point(data = subset(df_z_nooutlier,
!(label_family_2sd %in% c("very_low_extreme", "normal", "very_high_extreme"))),
size = 4) +
geom_point(data = subset(df_z_nooutlier, label_family_2sd %in% c("very_low_extreme", "normal", "very_high_extreme")),
aes(shape = label_family_2sd, color = Family), size = 4) +
scale_shape_manual(values = c("very_low_extreme" = 22, "normal" = 21, "very_high_extreme" = 24)) +
stat_summary(fun.x = median, geom = "errorbar",
aes(xmax = ..x.., xmin = ..x..), linewidth = 1, color = pokepal("rayquaza", spread = 1)) +
geom_vline(aes(xintercept = median), linetype = "dotted", color = pokepal("rayquaza", spread = 1)) +
labs(x = "Neoplasia Prevalence", y = "Families") +
theme(
axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 20)),
legend.position = "none") +
xlim(0,0.25)
## Warning in stat_summary(fun.x = median, geom = "errorbar", aes(xmax = ..x.., :
## Ignoring unknown parameters: `fun.x`
## No summary function supplied, defaulting to `mean_se()`

Histogram visualization for the three traits
hist(cancer_traits_filt$necropsy_count, xlab = "Necropsy Count", main = "Histogram of primates Necropsy Count")

hist(cancer_traits_filt$neoplasia_necropsy, xlab = "Neoplasia Count", main = "Histogram of primates Neoplasia Count")

hist(cancer_traits_filt$neoplasia_prevalence, xlab = "Neoplasia prevalence", main = "Histogram of primates Neoplasia Prevalence")

Visual distribution of the three traits along the phylogeny
Outlier highlight
tree_species <- pruned_primates.tree$tip.label
NC <- cancer_traits_filt$necropsy_count
names(NC) <- tree_species
NC <- NC[!is.na(names(NC))]
NN <- cancer_traits_filt$neoplasia_necropsy
names(NN) <- tree_species
NN <- NN[!is.na(names(NN))]
NP <- cancer_traits_filt$neoplasia_prevalence
names(NP) <- tree_species
NP <- NP[!is.na(names(NP))]
# Necropsy Count
obj_nc<-contMap(pruned_primates.tree,NC,plot=FALSE)
obj_nc<-setMap(obj_nc,c("#077223", "#53EC2D", "white", "#F1F97C",
"#F08431", "#F03131"))
plot(obj_nc,fsize=c(1),lwd=c(5),leg.txt="Necropsy Count")

# Neoplasia Count
obj_nn<-contMap(pruned_primates.tree,NN,plot=FALSE)
obj_nn<-setMap(obj_nn,c("#077223", "#53EC2D", "white", "#F1F97C",
"#F08431", "#F03131"))
plot(obj_nn,fsize=c(1),lwd=c(5),leg.txt="Neoplasia Count")

# Neoplasia_prevalence
obj_np<-contMap(pruned_primates.tree,NP,plot=FALSE)
obj_np<-setMap(obj_np,c("#077223", "#53EC2D", "white", "#F1F97C",
"#F08431", "#F03131"))
plot(obj_np,fsize=c(1),lwd=c(5),leg.txt="Neoplasia Prevalence")

Scatterplots
ggplot(cancer_traits_filt, aes(x=necropsy_count, y=neoplasia_necropsy, color=family))+
geom_point() +
geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
labs(y = "Neoplasia count", x = "Necropsy count", fill = "Neoplasia/Necropsy") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

ggplot(cancer_traits_filt, aes(x=normalize(necropsy_count), y=normalize(neoplasia_necropsy), color=family))+
geom_point() +
geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
labs(y = "Neoplasia count", x = "Necropsy count", fill = "Neoplasia/Necropsy") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Summary table
# Clean and Summarize the Data
summary_data <- cancer_traits_filt %>%
group_by(family) %>%
# Summarize data
summarize(
n = n(),
NC_Mean = mean(necropsy_count, na.rm = TRUE),
NC_Median = median(necropsy_count, na.rm = TRUE),
NC_SD = sd(necropsy_count, na.rm = TRUE),
min_NC = min(necropsy_count, na.rm = TRUE),
max_NC = max(necropsy_count, na.rm = TRUE),
NN_Mean = mean(neoplasia_necropsy, na.rm = TRUE),
NN_Median = median(neoplasia_necropsy, na.rm = TRUE),
NN_SD = sd(neoplasia_necropsy, na.rm = TRUE),
min_NN = min(neoplasia_necropsy, na.rm = TRUE),
max_NN = max(neoplasia_necropsy, na.rm = TRUE),
NP_Mean = mean(neoplasia_prevalence, na.rm = TRUE),
NP_Median = median(neoplasia_prevalence, na.rm = TRUE),
NP_SD = sd(neoplasia_prevalence, na.rm = TRUE),
min_NP = min(neoplasia_prevalence, na.rm = TRUE),
max_NP = max(neoplasia_prevalence, na.rm = TRUE)) %>%
# Compute the MLS Limits
mutate(NC_Limits = paste(min_NC, "-", max_NC),
NN_Limits = paste(min_NN, "-",max_NN),
NP_Limits = paste(min_NP, "-",max_NP))
summary_data
## # A tibble: 12 × 20
## family n NC_Mean NC_Median NC_SD min_NC max_NC NN_Mean NN_Median NN_SD
## <chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Aotidae 1 26 26 NA 26 26 0 0 NA
## 2 Atelidae 2 34.5 34.5 7.78 29 40 8.5 8.5 6.36
## 3 Callitr… 14 112. 80.5 74.8 41 309 5.57 2 7.84
## 4 Cebidae 2 115 115 109. 38 192 7 7 5.66
## 5 Cercopi… 16 64.8 56 39.4 21 158 2.88 2 2.66
## 6 Cheirog… 2 44 44 24.0 27 61 8.5 8.5 7.78
## 7 Galagid… 2 65 65 26.9 46 84 2 2 1.41
## 8 Hominid… 4 65 61.5 44.7 24 113 8.5 8.5 7.59
## 9 Hylobat… 1 58 58 NA 58 58 7 7 NA
## 10 Indriid… 1 50 50 NA 50 50 4 4 NA
## 11 Lemurid… 6 115. 57.5 144. 28 399 15.2 10 11.7
## 12 Lorisid… 2 52 52 35.4 27 77 7.5 7.5 7.78
## # ℹ 10 more variables: min_NN <dbl>, max_NN <dbl>, NP_Mean <dbl>,
## # NP_Median <dbl>, NP_SD <dbl>, min_NP <dbl>, max_NP <dbl>, NC_Limits <chr>,
## # NN_Limits <chr>, NP_Limits <chr>
# Use the gt function to create the table
table_display <- summary_data %>%
select(family,n,NC_Mean,NC_Median,NC_SD,NC_Limits,
NN_Mean,NN_Median,NN_SD,NN_Limits,
NP_Mean,NP_Median,NP_SD,NP_Limits) %>%
gt() %>%
tab_header(title = "Table. Mean NC, NN and NP Computed Using All Records from our Primate Database") %>%
# Formatting columns for better display
fmt_number(columns = c("NC_Mean", "NC_Median", "NC_SD", "NN_Mean", "NN_Median", "NN_SD", "NP_Mean", "NP_Median", "NP_SD"), decimals = 2) %>%
# Adding and removing table lines
tab_options(
table.border.top.style = "none", # Remove the top border
table.border.bottom.style = "solid",
table.border.bottom.width = 2,
column_labels.border.top.style = "double",
column_labels.border.bottom.style = "double")
# Display the table
table_display
| Table. Mean NC, NN and NP Computed Using All Records from our Primate Database |
| family |
n |
NC_Mean |
NC_Median |
NC_SD |
NC_Limits |
NN_Mean |
NN_Median |
NN_SD |
NN_Limits |
NP_Mean |
NP_Median |
NP_SD |
NP_Limits |
| Aotidae |
1 |
26.00 |
26.00 |
NA |
26 - 26 |
0.00 |
0.00 |
NA |
0 - 0 |
0.00 |
0.00 |
NA |
0 - 0 |
| Atelidae |
2 |
34.50 |
34.50 |
7.78 |
29 - 40 |
8.50 |
8.50 |
6.36 |
4 - 13 |
0.27 |
0.27 |
0.25 |
0.1 - 0.448275862068966 |
| Callitrichidae |
14 |
112.50 |
80.50 |
74.75 |
41 - 309 |
5.57 |
2.00 |
7.84 |
0 - 27 |
0.04 |
0.03 |
0.04 |
0 - 0.105590062111801 |
| Cebidae |
2 |
115.00 |
115.00 |
108.89 |
38 - 192 |
7.00 |
7.00 |
5.66 |
3 - 11 |
0.07 |
0.07 |
0.02 |
0.0572916666666667 - 0.0789473684210526 |
| Cercopithecidae |
16 |
64.75 |
56.00 |
39.41 |
21 - 158 |
2.88 |
2.00 |
2.66 |
0 - 10 |
0.04 |
0.04 |
0.03 |
0 - 0.106382978723404 |
| Cheirogaleidae |
2 |
44.00 |
44.00 |
24.04 |
27 - 61 |
8.50 |
8.50 |
7.78 |
3 - 14 |
0.17 |
0.17 |
0.08 |
0.111111111111111 - 0.229508196721311 |
| Galagidae |
2 |
65.00 |
65.00 |
26.87 |
46 - 84 |
2.00 |
2.00 |
1.41 |
1 - 3 |
0.04 |
0.04 |
0.04 |
0.0119047619047619 - 0.0652173913043478 |
| Hominidae |
4 |
65.00 |
61.50 |
44.70 |
24 - 113 |
8.50 |
8.50 |
7.59 |
1 - 16 |
0.11 |
0.13 |
0.05 |
0.0333333333333333 - 0.150537634408602 |
| Hylobatidae |
1 |
58.00 |
58.00 |
NA |
58 - 58 |
7.00 |
7.00 |
NA |
7 - 7 |
0.12 |
0.12 |
NA |
0.120689655172414 - 0.120689655172414 |
| Indriidae |
1 |
50.00 |
50.00 |
NA |
50 - 50 |
4.00 |
4.00 |
NA |
4 - 4 |
0.08 |
0.08 |
NA |
0.08 - 0.08 |
| Lemuridae |
6 |
114.67 |
57.50 |
143.66 |
28 - 399 |
15.17 |
10.00 |
11.70 |
7 - 38 |
0.20 |
0.18 |
0.09 |
0.0952380952380952 - 0.321428571428571 |
| Lorisidae |
2 |
52.00 |
52.00 |
35.36 |
27 - 77 |
7.50 |
7.50 |
7.78 |
2 - 13 |
0.12 |
0.12 |
0.07 |
0.0740740740740741 - 0.168831168831169 |
#gtsave(table_display, filename = file.path(resultsDir,"descriptive/primates_table.png"))
top1SD_spp <- df_z_nooutlier %>%
filter(label_family == "high_extreme") %>%
select(label_family,Species,value)
## Adding missing grouping variables: `Family`, `Trait`
bottom1SD_spp <- df_z_nooutlier %>%
filter(label_family == "low_extreme") %>%
select(label_family,Species,value)
## Adding missing grouping variables: `Family`, `Trait`
# family_names <- unique(cancer_traits_filt$family)
#
# get_uid <- function(family_name) {
# # Try fetching UID for species
# uid <- tryCatch({
# ggimage::phylopic_uid(family_name)
# }, error = function(e) {
# return(NULL) # return NULL on error
# })
# return(uid)
# }
#
# for uid in
#
# fam_uids <- sapply(family_names, get_uid)
#
# header_images <- paste0('<img src="', primate_images, '" height="50px">') %>%
# paste(collapse = " ")
#
# # Load the ggplot2 library
# library(ggplot2)
#
# # Combine the data
# combined_data <- rbind(
# transform(top1SD_spp, category = "Top Species"),
# transform(bottom1SD_spp, category = "Bottom Species")
# )
#
# combined_data$Species <- gsub("_", " ", combined_data$Species)
#
# # Load necessary libraries
# library(gt)
#
# combined_data %>%
# gt() %>%
# tab_header(title = “PRIMATES NEOPLASY ANALSYS”) %>%
# cols_label(
# Species = 'Species',
# neoplasia_prevalence = 'Neoplasia Prevalence') %>%
# tab_spanner(
# label = md('**Top Species**') %>%
# tab_style(
# style = list(cell_fill(color = “#b2f7ef”),
# cell_text(weight = “bold”)),
# locations = cells_body(columns = mpg))%>%
# tab_style(
# style = list(cell_fill(color = “#ffefb5”),
# cell_text(weight = “bold”)),
# locations = cells_body(columns = hp))
Continous variable analysis
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
##
## Attaching package: 'GGally'
## The following object is masked from 'package:R.utils':
##
## wrap
## The following object is masked from 'package:emmeans':
##
## pigs
# Check continuous variables distribution
p <- cancer_traits_filt %>%
select(necropsy_count, neoplasia_necropsy, neoplasia_prevalence) %>%
ggpairs()
# Select 2 variables, remove missing variables and calculate spearman correlation from them
cor_numer <- cancer_traits_filt %>%
select(necropsy_count, neoplasia_necropsy, neoplasia_prevalence) %>% # select_if(is.numeric)
drop_na() %>%
cor(., method = "spearman") #,use = "pairwise.complete.obs" )
# Correlation plot
corrplot.mixed(cor_numer, number.digits = 2, tl.col = "black") # Set the color of text labels for correlation numbers
